library(dplyr)
library(ggplot2)
library(tidyr)
library(reshape2)
library(tidyverse)
library(stringr)
library(caret)
library(fpp2)
library(dygraphs)
library(xts)
library(pander)
library(purrr)
library(ggthemes)
library(gridExtra)
library(cowplot)
library(RColorBrewer)
library(gplots)
library(corrplot)
library(functional)
library(fastNaiveBayes)
library(ggpubr)
dfdata<- readr::read_csv("CaseStudy2-data.csv")
head(dfdata)
dfdata %>% count(Attrition) ->att
att
You can also embed plots, for example:
agePlot <- ggplot(dfdata,aes(Age,fill=Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium") #maybe
agePlot
travelPlot <- ggplot(dfdata,aes(BusinessTravel,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")
travelPlot
ratePlot <- ggplot(dfdata,aes(DailyRate, fill = Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium")
ratePlot
depPlot <- ggplot(dfdata,aes(Department,fill = Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")
depPlot
distPlot<- ggplot(dfdata,aes(DistanceFromHome,fill=Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium") # important
distPlot
eduPlot <- ggplot(dfdata,aes(Education,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium") #maybe
eduPlot
edufieldPlot <- ggplot(dfdata,aes(EducationField,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")# important
edufieldPlot
envPlot <- ggplot(dfdata,aes(EnvironmentSatisfaction,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")# maybe
envPlot
genPlot <- ggplot(dfdata,aes(Gender,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium") #maybe
genPlot
hourlyPlot <- ggplot(dfdata,aes(HourlyRate,fill=Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium") #maybe
hourlyPlot
jobInvPlot <- ggplot(dfdata,aes(JobInvolvement,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium") #important
jobInvPlot
jobLevelPlot <- ggplot(dfdata,aes(JobLevel,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium") # important
jobLevelPlot
jobSatPlot <- ggplot(dfdata,aes(JobSatisfaction,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium") # important
jobSatPlot
overTimePlot <- ggplot(dfdata,aes(OverTime,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium") # important
overTimePlot
hikePlot <- ggplot(dfdata,aes(PercentSalaryHike, fill = Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium")# important
hikePlot
perfPlot <- ggplot(dfdata,aes(PerformanceRating,fill = Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")# maybe
perfPlot
RelSatPlot <- ggplot(dfdata,aes(RelationshipSatisfaction,fill = Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")# important
RelSatPlot
StockPlot <- ggplot(dfdata,aes(StockOptionLevel,fill = Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")# important
StockPlot
workingYearsPlot <- ggplot(dfdata,aes(TotalWorkingYears,fill = Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium")# important
workingYearsPlot
TrainTimesPlot <- ggplot(dfdata,aes(TrainingTimesLastYear,fill = Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")
TrainTimesPlot
WLBPlot<- ggplot(dfdata,aes(WorkLifeBalance,fill = Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")# important
WLBPlot
marPlot <- ggplot(dfdata,aes(MaritalStatus,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")#maybe
marPlot
monthlyIncPlot <- ggplot(dfdata,aes(MonthlyIncome,fill=Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium")# important
monthlyIncPlot
monthlyRatePlot <- ggplot(dfdata,aes(MonthlyRate,fill=Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium")
monthlyRatePlot
numCompPlot <- ggplot(dfdata,aes(NumCompaniesWorked,fill=Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium") # important
numCompPlot
YearAtComPlot <- ggplot(dfdata,aes(YearsAtCompany,fill = Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium") # important
YearAtComPlot
YearInCurrPlot <- ggplot(dfdata,aes(YearsInCurrentRole,fill = Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")
YearInCurrPlot
YearsSinceProm <- ggplot(dfdata,aes(YearsSinceLastPromotion,fill = Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium") #maybe
YearsSinceProm
YearsCurrManPlot <- ggplot(dfdata,aes(YearsWithCurrManager,fill = Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium") # important
YearsCurrManPlot
myplot <- ggplot(dfdata, aes(BusinessTravel, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot
myplot <- ggplot(dfdata, aes(Department, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot
myplot <- ggplot(dfdata, aes(Education, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot
myplot <- ggplot(dfdata, aes(EducationField, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot
myplot <- ggplot(dfdata, aes(EnvironmentSatisfaction, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot # maybe
myplot <- ggplot(dfdata, aes(Gender, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot
myplot <- ggplot(dfdata, aes(JobInvolvement, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot
myplot <- ggplot(dfdata, aes(JobLevel, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot # important
myplot <- ggplot(dfdata, aes(JobSatisfaction, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot
myplot <- ggplot(dfdata, aes(OverTime, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot # important
myplot <- ggplot(dfdata, aes(RelationshipSatisfaction, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot
myplot <- ggplot(dfdata, aes(StockOptionLevel, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot # important
myplot <- ggplot(dfdata, aes(WorkLifeBalance, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot
myplot <- ggplot(dfdata, aes(MaritalStatus, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot # maybe
myplot <- ggplot(dfdata, aes(YearsInCurrentRole, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot #maybe
myplot <- ggplot(dfdata, aes(YearsWithCurrManager, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot # important
myplot <- ggplot(dfdata, aes(YearsAtCompany, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot # important/maybe
myplot <- ggplot(dfdata, aes(NumCompaniesWorked, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot
myplot <- ggplot(dfdata, aes(TrainingTimesLastYear, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot
myplot <- ggplot(dfdata, aes(TotalWorkingYears, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot # maybe
myplot <- ggplot(dfdata, aes(DistanceFromHome, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot # maybe
myplot <- ggplot(dfdata, aes(Age, group = Attrition)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~Attrition)
myplot
myplot <- ggplot(dfdata, aes(EducationField, group = JobSatisfaction )) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_y_continuous(labels=scales::percent) +
ylab("relative frequencies") +
facet_grid(~JobSatisfaction)
myplot
ys <- names(dfdata)[c(2, 5, 7, 14, 20, 21, 22, 25, 30, 31, 33, 34, 35, 36)]
ys %>% map(function(y)
ggplot(dfdata , aes(MonthlyIncome)) + geom_point(aes_string(y=y)))
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
#### More exploratory data for montly income with heatmap for numerical values
dfn <-dfdata[c(2, 5, 7, 14, 20, 21, 22, 25, 30, 31, 33, 34, 35, 36 )]
my_palette <- colorRampPalette(c("red", "white", "black"))
heatmapper <- function(df){
df %>%
keep(is.numeric) %>%
tidyr::drop_na() %>%
cor %>%
heatmap.2(col = my_palette ,
density.info = "none", trace = "none",
dendogram = c("both"), symm = F,
symkey = T, symbreaks = T, scale = "none",
key = T)
}
heatmapper(dfn)
correlator <- function(df){
df %>%
keep(is.numeric) %>%
tidyr::drop_na() %>%
cor %>%
corrplot( addCoef.col = "white", number.digits = 2,
number.cex = 0.5, method="square",
order="hclust", title="Variable Corr Heatmap",
tl.srt=45, tl.cex = 0.8)
}
correlator(dfn)
#### More exploratory data for montly income with ggplot numeric for numerical values
plotAllNumeric <- function(df){
df%>%keep(is.numeric) %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~ key, scales = "free") +
geom_density()+geom_histogram() + theme_fivethirtyeight()
}
plotAllNumeric(dfn)
dfdata %>% keep(is.factor) %>% names -> label
ggplot(data = dfdata, aes(x = BusinessTravel, y = MonthlyIncome, fill =BusinessTravel )) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()
ggplot(data = dfdata, aes(x = Department, y = MonthlyIncome, fill = Department)) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()
ggplot(data = dfdata, aes(x = as.factor(Education), y = MonthlyIncome, fill = as.factor(Education))) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few() # important
ggplot(data = dfdata, aes(x = EducationField, y = MonthlyIncome, fill = EducationField)) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()
ggplot(data = dfdata, aes(x = as.factor(EnvironmentSatisfaction), y = MonthlyIncome, fill = as.factor(EnvironmentSatisfaction))) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()
ggplot(data = dfdata, aes(x = Gender, y = MonthlyIncome, fill = Gender)) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()
ggplot(data = dfdata, aes(x = as.factor(JobInvolvement), y = MonthlyIncome, fill = as.factor(JobInvolvement))) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()
ggplot(data = dfdata, aes(x = as.factor(JobLevel), y = MonthlyIncome, fill = as.factor(JobLevel))) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few() # important
ggplot(data = dfdata, aes(x = JobRole, y = MonthlyIncome, fill = JobRole)) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()# important
ggplot(data = dfdata, aes(x = as.factor(JobSatisfaction), y = MonthlyIncome, fill = as.factor(JobSatisfaction))) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()
ggplot(data = dfdata, aes(x = MaritalStatus, y = MonthlyIncome, fill = MaritalStatus)) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()
ggplot(data = dfdata, aes(x = OverTime, y = MonthlyIncome, fill = OverTime)) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()
ggplot(data = dfdata, aes(x = as.factor(PerformanceRating), y = MonthlyIncome, fill = as.factor(PerformanceRating))) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few() #maybe
ggplot(data = dfdata, aes(x = as.factor(RelationshipSatisfaction), y = MonthlyIncome, fill = as.factor(RelationshipSatisfaction))) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()
ggplot(data = dfdata, aes(x = as.factor(StockOptionLevel), y = MonthlyIncome, fill = as.factor(StockOptionLevel))) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few() # maybe
ggplot(data = dfdata, aes(x = as.factor(WorkLifeBalance), y = MonthlyIncome, fill =as.factor(WorkLifeBalance))) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()
dfdata$YearsInCurrentRole<- cut(as.numeric(dfdata$YearsInCurrentRole), breaks = c(-1,1,100))
dfdata$TotalWorkingYears<- cut(as.numeric(dfdata$TotalWorkingYears), breaks = c(-1,1,100))
dfdata$YearsWithCurrManager<- cut(as.numeric(dfdata$YearsWithCurrManager), breaks = c(-1,1,100))
dfdata %>% filter(MaritalStatus %in% c("Single", "Divorce"))-> MaritalStatus
dfdata$MaritalStatus <- as.factor(dfdata$MaritalStatus)
levels(dfdata$MaritalStatus) <- c("NotMarried", "Married", "NotMarried")
clasy<- dfdata[c("Attrition", "StockOptionLevel", "JobLevel", "MonthlyIncome", "OverTime")]
clasy[c("Attrition","StockOptionLevel", "JobLevel", "OverTime")] <- lapply(clasy[c("Attrition","StockOptionLevel", "JobLevel", "OverTime")], as.factor)
head(clasy)
set.seed(3033)
split <- function(df, p = 0.75, list = FALSE, ...) {
train_ind <- createDataPartition(df[[1]], p = p, list = list)
cat("creating training dataset...\n")
training <<- df[train_ind, ]
cat("completed training dataset, creating test set\n")
test <<- df[-train_ind, ]
cat("done")
}
split(clasy)
## creating training dataset...
## completed training dataset, creating test set
## done
library(doParallel)
numcores <- parallel::detectCores() - 1
cl <- makePSOCKcluster(numcores)
registerDoParallel(cl)
set.seed(3333)
trainMethod <- trainControl( method = "repeatedcv", number = 25, repeats = 5, summaryFunction = twoClassSummary, classProbs = TRUE)
fit.nb1 <- train(Attrition ~ ., data = training, method = "nb", metric = "Spec", trControl = trainMethod)
fit.nb1
## Naive Bayes
##
## 653 samples
## 4 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (25 fold, repeated 5 times)
## Summary of sample sizes: 627, 626, 627, 628, 626, 626, ...
## Resampling results across tuning parameters:
##
## usekernel ROC Sens Spec
## FALSE 0.7232026 0.7445195 0.6584
## TRUE 0.7668199 0.9992727 0.0000
##
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning
## parameter 'adjust' was held constant at a value of 1
## Spec was used to select the optimal model using the largest value.
## The final values used for the model were fL = 0, usekernel = FALSE
## and adjust = 1.
plot(fit.nb1)
test_pred <- predict(fit.nb1, newdata = test)
test_pred
## [1] No Yes No Yes Yes No No No No Yes No No No No No No Yes
## [18] No Yes No No Yes Yes Yes Yes No No No No No No No Yes No
## [35] No No Yes No No No No No No No No No No No No No No
## [52] No No No No Yes No No Yes No No No Yes No No No Yes No
## [69] No Yes No No No No Yes No Yes Yes No Yes No No No Yes No
## [86] Yes No Yes No Yes No No No Yes No Yes No No No No Yes No
## [103] Yes No Yes No No No Yes No No Yes No No Yes No No No No
## [120] No Yes Yes Yes No No No No Yes No No Yes Yes Yes No Yes Yes
## [137] Yes No No No No No No Yes No No No No Yes Yes Yes Yes No
## [154] No Yes No Yes No No Yes No No No No No No No Yes No No
## [171] Yes Yes No Yes No No No No Yes Yes Yes No Yes No No Yes Yes
## [188] Yes Yes No No No No No No Yes Yes No No Yes No Yes No Yes
## [205] No No Yes No No No Yes No No No No No No
## Levels: No Yes
confusionMatrix(test_pred, test$Attrition)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 136 11
## Yes 46 24
##
## Accuracy : 0.7373
## 95% CI : (0.6735, 0.7946)
## No Information Rate : 0.8387
## P-Value [Acc > NIR] : 0.9999
##
## Kappa : 0.3084
##
## Mcnemar's Test P-Value : 6.687e-06
##
## Sensitivity : 0.7473
## Specificity : 0.6857
## Pos Pred Value : 0.9252
## Neg Pred Value : 0.3429
## Prevalence : 0.8387
## Detection Rate : 0.6267
## Detection Prevalence : 0.6774
## Balanced Accuracy : 0.7165
##
## 'Positive' Class : No
##
stopCluster(cl)
dfdata<- readr::read_csv("CaseStudy2-data.csv")
reg <- dfdata[c("TotalWorkingYears", "YearsAtCompany", "Age", "Education", "JobLevel", "JobRole", "MonthlyIncome", "PerformanceRating", "StockOptionLevel" )]
reg[c("Education","JobLevel", "JobRole","PerformanceRating", "StockOptionLevel")] <- lapply(reg[c("Education","JobLevel", "JobRole","PerformanceRating", "StockOptionLevel")], as.factor)
reg
set.seed(3033)
split <- function(df, p = 0.75, list = FALSE, ...) {
train_ind <- createDataPartition(df[[1]], p = p, list = list)
cat("creating training dataset...\n")
training <<- df[train_ind, ]
cat("completed training dataset, creating test set\n")
test <<- df[-train_ind, ]
cat("done")
}
split(reg)
## creating training dataset...
## completed training dataset, creating test set
## done
regincome1 <- lm( MonthlyIncome ~ TotalWorkingYears + JobLevel + JobRole, data = training)
summary(regincome1)
##
## Call:
## lm(formula = MonthlyIncome ~ TotalWorkingYears + JobLevel + JobRole,
## data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3085.0 -647.5 -97.3 638.3 4284.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3642.521 220.489 16.520 < 2e-16 ***
## TotalWorkingYears 53.411 9.134 5.847 7.96e-09 ***
## JobLevel2 1598.495 170.893 9.354 < 2e-16 ***
## JobLevel3 4650.058 226.324 20.546 < 2e-16 ***
## JobLevel4 7872.592 345.320 22.798 < 2e-16 ***
## JobLevel5 10669.268 394.495 27.045 < 2e-16 ***
## JobRoleHuman Resources -1242.551 320.507 -3.877 0.000117 ***
## JobRoleLaboratory Technician -1309.554 212.884 -6.151 1.35e-09 ***
## JobRoleManager 3557.429 293.615 12.116 < 2e-16 ***
## JobRoleManufacturing Director 239.931 187.576 1.279 0.201319
## JobRoleResearch Director 3701.321 252.276 14.672 < 2e-16 ***
## JobRoleResearch Scientist -1091.211 216.239 -5.046 5.87e-07 ***
## JobRoleSales Executive 8.403 162.191 0.052 0.958697
## JobRoleSales Representative -1399.737 261.799 -5.347 1.25e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1036 on 641 degrees of freedom
## Multiple R-squared: 0.9504, Adjusted R-squared: 0.9494
## F-statistic: 944 on 13 and 641 DF, p-value: < 2.2e-16
pred <- predict(regincome1, newdata = test)
str(pred)
## Named num [1:215] 9067 9049 2872 9013 5668 ...
## - attr(*, "names")= chr [1:215] "1" "2" "3" "4" ...
ASA2 <- mean((pred[1:nrow(test)] - test$MonthlyIncome)^2)
sqrt(ASA2)
## [1] 927.0692
AIC(regincome1)
## [1] 10970.58
BIC(regincome1)
## [1] 11037.85
pred2 <- predict(regincome1, newdata = test)
pred2
## 1 2 3 4 5 6 7
## 9066.617 9048.732 2871.775 9013.207 5668.301 2560.202 2934.077
## 8 9 10 11 12 13 14
## 5730.115 5908.232 5854.821 5775.123 5783.526 2867.074 9102.143
## 15 16 17 18 19 20 21
## 6103.990 19792.003 8781.679 6068.464 2867.074 2403.017 8835.089
## 22 23 24 25 26 27 28
## 8835.089 9529.428 2386.378 6015.054 13357.364 6317.632 5775.123
## 29 30 31 32 33 34 35
## 16300.988 6148.998 8995.321 5569.883 2818.364 2764.953 6095.587
## 36 37 38 39 40 41 42
## 4465.569 6015.054 5218.019 12858.783 4412.158 2760.253 13517.596
## 43 44 45 46 47 48 49
## 5801.411 2386.378 4683.912 16658.522 2658.132 8781.679 9013.207
## 50 51 52 53 54 55 56
## 19508.610 2867.074 9574.436 2386.378 12698.551 2978.596 18990.843
## 57 58 59 60 61 62 63
## 2818.364 2604.721 13117.433 6015.054 12858.783 6103.990 19151.075
## 64 65 66 67 68 69 70
## 13008.704 2563.249 9120.028 5890.347 2604.721 12848.472 5676.704
## 71 72 73 74 75 76 77
## 2653.431 16071.005 2386.378 5783.526 4321.975 5836.936 2658.132
## 78 79 80 81 82 83 84
## 2813.664 5569.883 4732.622 4518.980 12741.650 2600.021 2871.775
## 85 86 87 88 89 90 91
## 6015.054 5454.659 5164.608 16300.988 9048.732 19471.539 5676.704
## 92 93 94 95 96 97 98
## 12645.140 5801.411 5004.376 5668.301 9529.428 6050.579 2818.364
## 99 100 101 102 103 104 105
## 19188.146 4732.622 2764.953 4683.912 16925.576 5997.168 2706.842
## 106 107 108 109 110 111 112
## 16728.273 8835.089 2973.896 4950.965 2546.610 5775.123 2827.256
## 113 114 115 116 117 118 119
## 2871.775 5783.526 4465.569 2871.775 5676.704 2978.596 2296.195
## 120 121 122 123 124 125 126
## 8986.918 2403.017 2706.842 2600.021 2925.185 5561.480 2871.775
## 127 128 129 130 131 132 133
## 9360.793 2604.721 6228.696 3085.417 8531.936 2720.434 4532.572
## 134 135 136 137 138 139 140
## 2546.610 2706.842 2871.775 5783.526 5997.168 2604.721 5569.883
## 141 142 143 144 145 146 147
## 2706.842 5561.480 5057.787 2776.891 2349.606 16925.576 5730.115
## 148 149 150 151 152 153 154
## 8773.276 7530.725 2760.253 12634.829 9760.956 9048.732 6264.222
## 155 156 157 158 159 160 161
## 2439.789 8781.679 2818.364 12651.169 5801.411 4465.569 8995.321
## 162 163 164 165 166 167 168
## 2493.199 4465.569 17139.219 5569.883 3085.417 2818.364 6442.339
## 169 170 171 172 173 174 175
## 2925.185 4532.572 2711.543 2403.017 4897.555 4683.912 9066.617
## 176 177 178 179 180 181 182
## 5961.643 2760.253 2818.364 2386.378 2453.381 12384.115 5988.765
## 183 184 185 186 187 188 189
## 2764.953 2349.606 16461.220 6264.222 4465.569 3085.417 6050.579
## 190 191 192 193 194 195 196
## 9360.793 2764.953 2773.845 5463.061 15697.130 12651.169 19829.074
## 197 198 199 200 201 202 203
## 3032.007 16194.166 4839.444 9093.740 5569.883 2658.132 2706.842
## 204 205 206 207 208 209 210
## 2439.789 2813.664 19578.360 2332.967 5569.883 4465.569 2386.378
## 211 212 213 214 215
## 5463.061 2867.074 5694.589 13019.015 5836.936
noinc <- read_csv("CaseStudy2CompSet No Salary (2).csv")
noinc[c("Education","JobLevel", "JobRole","PerformanceRating", "StockOptionLevel")] <- lapply(noinc[c("Education","JobLevel", "JobRole","PerformanceRating", "StockOptionLevel")], as.factor)
noinc
preds <-predict(regincome1, newdata = noinc)
preddf <- data.frame(predicted = preds, ID = noinc$ID)
preddf <- preddf %>% arrange(ID)
write.csv(preddf, "Case2PredictionsSavorgnanSalary.csv")
library(readr)
noatt <- read_csv("CaseStudy2CompSet No Attrition.csv")
noatt[c("StockOptionLevel", "JobLevel", "OverTime")] <- lapply(noatt[c("StockOptionLevel", "JobLevel", "OverTime")], as.factor)
head(noatt)
test_preda <- predict(fit.nb1, newdata = noatt)
test_preda
## [1] No No No No No No No Yes No No Yes Yes No No Yes No No
## [18] No No No Yes No Yes No No No No Yes No Yes Yes No No No
## [35] Yes No No No No No Yes Yes No No Yes No No No Yes No No
## [52] Yes No No No No Yes No No Yes No No Yes No No No No No
## [69] No Yes No No No No No Yes No Yes Yes No No No No No No
## [86] No No No No No No No Yes No No No No No Yes Yes No No
## [103] Yes No No No No No Yes No No No No No No No Yes Yes Yes
## [120] No No Yes No No No No Yes No No No No No No No No No
## [137] No No Yes No No No Yes Yes Yes No No Yes Yes No No No No
## [154] No Yes No Yes Yes Yes No No No Yes No No Yes No No Yes No
## [171] Yes Yes No Yes No No No No No No No No No No No No No
## [188] Yes No No Yes No No No Yes Yes Yes No No No No No Yes No
## [205] No Yes No No No No Yes No Yes No No Yes No No No No Yes
## [222] No No No No Yes Yes No Yes No Yes Yes No Yes Yes Yes Yes No
## [239] No No Yes Yes No Yes No No Yes Yes No Yes Yes No No Yes No
## [256] No No Yes Yes No No No No No No No No No Yes No No No
## [273] No Yes Yes Yes No No No Yes No No No No No Yes Yes Yes Yes
## [290] Yes No No No No Yes Yes No Yes No No
## Levels: No Yes
predi <- data.frame(predicted = test_preda, ID = noatt$ID)
predited <- predi %>% arrange(ID)
write.csv(predited,"Case2PredictionsSavorgnanAttrition.csv")